In [7]:
# encoding=utf8
from __future__ import division
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import locale
import glob
import os.path
import requests
import tarfile
import io
dirname = 'aclImdb'
filename = 'aclImdb_v1.tar.gz'
# locale.setlocale(locale.LC_ALL, 'C')
# Convert text to lower-case and pad punctuation with spaces,
# so each punctuation mark becomes its own token
def normalize_text(text):
    norm_text = text.lower()
    # Replace HTML line breaks with spaces
    norm_text = norm_text.replace('<br />', ' ')
    # Pad punctuation with spaces on both sides
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
        norm_text = norm_text.replace(char, ' ' + char + ' ')
    return norm_text
if not os.path.isfile('aclImdb/alldata-id.txt'):
    if not os.path.isdir(dirname):
        if not os.path.isfile(filename):
            # Download IMDB archive
            url = 'http://ai.stanford.edu/~amaas/data/sentiment/' + filename
            r = requests.get(url)
            with open(filename, 'wb') as f:
                f.write(r.content)
        tar = tarfile.open(filename, mode='r')
        tar.extractall()
        tar.close()

    # Concat and normalize test/train data
    folders = ['train/pos', 'train/neg', 'test/pos', 'test/neg', 'train/unsup']
    alldata = u''
    for fol in folders:
        temp = u''
        output = fol.replace('/', '-') + '.txt'
        # Is there a better pattern to use?
        txt_files = glob.glob('/'.join([dirname, fol, '*.txt']))
        for txt in txt_files:
            with io.open(txt, 'r', encoding='utf-8') as t:
                # Strip the NEL control character (U+0085) found in some reviews
                control_chars = [u'\x85']
                t_clean = t.read()
                for c in control_chars:
                    t_clean = t_clean.replace(c, ' ')
                temp += t_clean
            temp += "\n"
        temp_norm = normalize_text(temp)
        with io.open('/'.join([dirname, output]), 'w', encoding='utf-8') as n:
            n.write(temp_norm)
        alldata += temp_norm

    with io.open('/'.join([dirname, 'alldata-id.txt']), 'w', encoding='utf-8') as f:
        for idx, line in enumerate(alldata.splitlines()):
            num_line = u"_*{0} {1}\n".format(idx, line)
            f.write(num_line)
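As a quick sanity check (a sketch, not part of the original notebook), normalize_text should lower-case its input, drop HTML breaks, and leave each punctuation mark as its own space-separated token:
In [ ]:
# Sketch: verify the normalization on a small sample string
sample_text = u'What a great movie!<br />Loved it (mostly).'
print(normalize_text(sample_text))
# punctuation such as '!', '(', ')' and '.' should now be space-separated tokens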
In [2]:
import os.path
assert os.path.isfile("aclImdb/alldata-id.txt"), "alldata-id.txt unavailable"
In [13]:
import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

alldocs = []  # will hold all docs in original order
with io.open('aclImdb/alldata-id.txt', encoding='utf-8') as alldata:
    for line_no, line in enumerate(alldata):
        tokens = gensim.utils.to_unicode(line).split()
        words = tokens[1:]
        tags = [line_no]  # `tags = [tokens[0]]` would also work, at extra memory cost
        try:
            split = ['train', 'test', 'extra', 'extra'][line_no // 25000]  # 25k train, 25k test, 50k extra
            sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no // 12500]  # [12.5K pos, 12.5K neg]*2 then unknown
            alldocs.append(SentimentDocument(words, tags, split, sentiment))
        except IndexError:
            # more than the expected 100,000 lines -- skip the extras
            print('unexpected extra line: %d' % line_no)

train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
doc_list = alldocs[:]  # for reshuffling per pass

print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))
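A quick peek (a sketch) at records on either side of the split boundaries can confirm the labels landed as intended, i.e. lines 0-24999 are train, 25000-49999 are test, and the remainder are unlabeled extras:
In [ ]:
# Sketch: one record from each region of the corpus
for i in [0, 25000, 50000]:
    doc = alldocs[i]
    print(doc.split, doc.sentiment, doc.tags, ' '.join(doc.words[:8]), '...')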
In [14]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing
cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"
simple_models = [
    # PV-DM w/ concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DBOW
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DM w/ averaging
    Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
]

# Speed up setup by sharing the results of the 1st model's vocabulary scan
simple_models[0].build_vocab(alldocs)  # PV-DM w/ concat requires one special NULL word, so it serves as the template
print(simple_models[0])
for model in simple_models[1:]:
    model.reset_from(simple_models[0])
    print(model)

models_by_name = OrderedDict((str(model), model) for model in simple_models)
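Since reset_from copies the first model's vocabulary scan, all three models should now agree on their counts; a quick check (a sketch, using the same pre-1.0 gensim attributes the rest of this notebook relies on):
In [ ]:
# Sketch: the shared-vocabulary setup should leave all models with identical counts
for model in simple_models:
    print('%s -> %d vocabulary words, %d doc-tags' % (model, len(model.vocab), model.docvecs.count))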
In [15]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])
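The ConcatenatedDoc2Vec wrapper trains nothing itself; it just serves back the wrapped models' vectors glued end-to-end, so each paired vector's length is the sum of the component sizes (here 100 + 100 = 200). A quick check (sketch):
In [ ]:
# Sketch: a paired model's doc-vector length is the sum of its components' sizes
print(len(models_by_name['dbow+dmm'].docvecs[0]))  # expect 200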
In [17]:
import numpy as np
import statsmodels.api as sm
from random import sample

# For timing
from contextlib import contextmanager
from timeit import default_timer

@contextmanager
def elapsed_timer():
    start = default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    # after the block exits, freeze the elapsed time at its final value
    end = default_timer()
    elapser = lambda: end - start
def logistic_predictor_from_data(train_targets, train_regressors):
    logit = sm.Logit(train_targets, train_regressors)
    predictor = logit.fit(disp=0)
    # print(predictor.summary())
    return predictor
def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):
    """Report error rate on test_set sentiments, using the supplied model and train_set"""
    train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]]) for doc in train_set])
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_data = test_set
    if infer:
        if infer_subsample < 1.0:
            test_data = sample(test_data, int(infer_subsample * len(test_data)))
        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]
    else:
        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_data]
    test_regressors = sm.add_constant(test_regressors)

    # Predict & evaluate
    test_predictions = predictor.predict(test_regressors)
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)
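Before any training pass, the doc-vectors are still at their random initializations, so the classifier should perform near chance; a quick baseline (sketch) makes later improvements easier to trust:
In [ ]:
# Sketch: error rate with untrained doc-vectors should hover near 0.5
baseline_err, _, _, _ = error_rate_for_model(simple_models[1], train_docs, test_docs)
print('baseline error rate before training: %.3f' % baseline_err)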
In [18]:
from collections import defaultdict
best_error = defaultdict(lambda: 1.0)  # to selectively print only the best error rates achieved
In [19]:
from random import shuffle
import datetime

alpha, min_alpha, passes = (0.025, 0.001, 20)
alpha_delta = (alpha - min_alpha) / passes

print("START %s" % datetime.datetime.now())

for epoch in range(passes):
    shuffle(doc_list)  # shuffling gets best results

    for name, train_model in models_by_name.items():
        # train
        duration = 'na'
        train_model.alpha, train_model.min_alpha = alpha, alpha
        with elapsed_timer() as elapsed:
            train_model.train(doc_list)
            duration = '%.1f' % elapsed()

        # evaluate
        eval_duration = ''
        with elapsed_timer() as eval_elapsed:
            err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs)
        eval_duration = '%.1f' % eval_elapsed()
        best_indicator = ' '
        if err <= best_error[name]:
            best_error[name] = err
            best_indicator = '*'
        print("%s%f : %i passes : %s %ss %ss" % (best_indicator, err, epoch + 1, name, duration, eval_duration))

        if ((epoch + 1) % 5) == 0 or epoch == 0:
            eval_duration = ''
            with elapsed_timer() as eval_elapsed:
                infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs, infer=True)
            eval_duration = '%.1f' % eval_elapsed()
            best_indicator = ' '
            if infer_err < best_error[name + '_inferred']:
                best_error[name + '_inferred'] = infer_err
                best_indicator = '*'
            print("%s%f : %i passes : %s %ss %ss" % (best_indicator, infer_err, epoch + 1, name + '_inferred', duration, eval_duration))

    print('completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta

print("END %s" % str(datetime.datetime.now()))
In [ ]:
# print best error rates achieved
for rate, name in sorted((rate, name) for name, rate in best_error.items()):
    print("%f %s" % (rate, name))
In [ ]:
doc_id = np.random.randint(simple_models[0].docvecs.count) # pick random doc; re-run cell for more examples
print('for doc %d...' % doc_id)
for model in simple_models:
    inferred_docvec = model.infer_vector(alldocs[doc_id].words)
    print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))
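A complementary check (sketch): the cosine similarity between each model's stored vector for this document and a freshly inferred vector for the same text; well-trained models should score close to 1.0:
In [ ]:
# Sketch: compare each model's trained doc-vector with a re-inferred one
for model in simple_models:
    trained = model.docvecs[doc_id]
    inferred = model.infer_vector(alldocs[doc_id].words)
    cos = np.dot(trained, inferred) / (np.linalg.norm(trained) * np.linalg.norm(inferred))
    print('%s: cosine(trained, inferred) = %.3f' % (model, cos))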
In [ ]:
import random
doc_id = np.random.randint(simple_models[0].docvecs.count) # pick random doc, re-run cell for more examples
model = random.choice(simple_models) # and a random model
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count) # get *all* similar documents
print(u'TARGET (%d): «%s»\n' % (doc_id, ' '.join(alldocs[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(alldocs[sims[index][0]].words)))
In [ ]:
word_models = simple_models[:]
In [ ]:
import random
from IPython.display import HTML
# pick a random word with a suitable number of occurrences
while True:
    word = random.choice(word_models[0].index2word)
    if word_models[0].vocab[word].count > 10:
        break
# or uncomment the line below to just pick a word from the relevant domain:
#word = 'comedy/drama'
similars_per_model = [str(model.most_similar(word, topn=20)).replace('), ', '),<br>\n') for model in word_models]
similar_table = ("<table><tr><th>" +
                 "</th><th>".join([str(model) for model in word_models]) +
                 "</th></tr><tr><td>" +
                 "</td><td>".join(similars_per_model) +
                 "</td></tr></table>")
print("most similar words for '%s' (%d occurrences)" % (word, simple_models[0].vocab[word].count))
HTML(similar_table)
In [ ]:
# assuming something like
# https://word2vec.googlecode.com/svn/trunk/questions-words.txt
# is in the local directory
# note: this takes many minutes
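# Sketch (not in the original): fetch the analogy file first if it's missing.
# The googlecode URL above is old and may no longer resolve, in which case a
# current mirror of questions-words.txt would be needed instead.
if not os.path.isfile('questions-words.txt'):
    r = requests.get('https://word2vec.googlecode.com/svn/trunk/questions-words.txt')
    with open('questions-words.txt', 'wb') as f:
        f.write(r.content)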
for model in word_models:
    sections = model.accuracy('questions-words.txt')
    correct, incorrect = len(sections[-1]['correct']), len(sections[-1]['incorrect'])
    print('%s: %0.2f%% correct (%d of %d)' % (model, float(correct * 100) / (correct + incorrect), correct, correct + incorrect))